library(here)
## here() starts at /home/peter/SCORE/C075_Grant_Coultas
sce <- readRDS(here("data/SCEs/C075_Grant_Coultas.scPipe.SCE.rds"))

library(scater)
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
as.data.frame(colData(sce)) %>%
  group_by(plate_number, cell_type_descriptor, sample_name) %>%
  count() %>%
  knitr::kable()
plate_number cell_type_descriptor sample_name n
LC279 Control sample # 1577 20 cells 3
LC279 Control sample # 1577 single cell 185
LC279 Mutant sample # 1576 20 cells 3
LC279 Mutant sample # 1576 single cell 185
LC280 Control sample # 1577 20 cells 3
LC280 Control sample # 1577 single cell 185
LC280 Mutant sample # 1576 20 cells 3
LC280 Mutant sample # 1576 single cell 185
LC294 Control sample # 1611 20 cells 3
LC294 Control sample # 1611 single cell 185
LC294 Mutant sample # 1609 20 cells 3
LC294 Mutant sample # 1609 single cell 185
LC358 Control sample # 1696 20 cells 3
LC358 Control sample # 1696 single cell 184
LC358 Mutant sample # 1695 20 cells 3
LC358 Mutant sample # 1695 single cell 182
LC392 Control sample # 1743 20 cells 3
LC392 Control sample # 1743 single cell 185
LC392 Mutant sample # 1747 20 cells 3
LC392 Mutant sample # 1747 single cell 185
LC396 Control sample # 1824 20 cells 2
LC396 Control sample # 1824 single cell 112
LC396 Mutant sample # 1819 20 cells 3
LC396 Mutant sample # 1819 single cell 185
LC398 Control sample # 1824 20 cells 2
LC398 Control sample # 1824 single cell 55
LC398 Mutant sample # 1823 20 cells 3
LC398 Mutant sample # 1823 single cell 185
library(EnsDb.Mmusculus.v79)
## Loading required package: ensembldb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
## 
##     filter
## The following object is masked from 'package:stats':
## 
##     filter
ensembl <- gsub("\\.[0-9]+$", "", rownames(sce))
symb <- mapIds(
  x = EnsDb.Mmusculus.v79,
  # NOTE: Need to remove gene version number prior to lookup.
  keys = ensembl,
  keytype = "GENEID",
  column = "SYMBOL")
## Warning: Unable to map 6238 of 34297 requested IDs.
rowData(sce)$ENSEMBL <- ensembl
rowData(sce)$SYMBOL <- symb
# Replace the row names of the SCE by the gene symbols (where available).
library(scater)
rownames(sce) <- uniquifyFeatureNames(rownames(sce), rowData(sce)$SYMBOL)
# Add chromosome location so we can filter on mitochondrial genes.
location <- mapIds(
  x = EnsDb.Mmusculus.v79,
  # NOTE: Need to remove gene version number prior to lookup.
  keys = rowData(sce)$ENSEMBL,
  keytype = "GENEID",
  column = "SEQNAME")
## Warning: Unable to map 6238 of 34297 requested IDs.
rowData(sce)$CHR <- location

is_mito <- rowData(sce)$CHR == "MT"
sce <- addPerCellQC(sce, subsets = list(Mt = which(is_mito)))

sce$sample <- paste0(
  sce$plate_number,
  ".",
  stringr::str_extract(sce$cell_type_descriptor, "# [0-9]+"))
sce$group <- ifelse(grepl("Mutant", sce$cell_type_descriptor), "mutant", "control")
plotColData(
  sce,
  y = "sum",
  x = "sample",
  colour_by = "group",
  other_fields = "sample_name") +
  scale_y_log10() +
  facet_grid(~sample_name) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

plotColData(
  sce,
  y = "detected",
  x = "sample",
  colour_by = "group",
  other_fields = "sample_name") +
  scale_y_log10() +
  facet_grid(~sample_name) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

plotColData(
  sce,
  y = "altexps_ERCC_percent",
  x = "sample",
  colour_by = "group",
  other_fields = "sample_name") +
  ylim(0, 100) +
  facet_grid(~sample_name) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

plotColData(
  sce,
  y = "subsets_Mt_percent",
  x = "sample",
  colour_by = "group",
  other_fields = "sample_name") +
  ylim(0, 100) +
  facet_grid(~sample_name) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

p <- lapply(sort(unique(sce$plate_number)), function(p) {
  z <- sce[, sce$plate_number == p]
  plotPlatePosition(
    z,
    as.character(z$well_position),
    colour_by = "altexps_ERCC_percent",
    point_size = 2,
    point_alpha = 1,
    theme_size = 5,
    shape_by = "group",
    by_show_single = TRUE) +
    ggtitle(p) +
    scale_colour_viridis_c(
      limits = c(0, 60),
      breaks = seq(0, 60, 10)) +
    scale_shape_manual(values = c(control = 16, mutant = 17)) +
    guides(shape = FALSE)
})
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
multiplot(plotlist = p, cols = 3)

# Check expression of mutant gene (Kat7)
plotExpression(
  sce,
  "Kat7",
  x = "sample",
  exprs_values = "counts",
  colour_by = "group") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

table(sce$cell_type_descriptor, counts(sce)["Kat7", ] > 0)
##                        
##                         FALSE TRUE
##   Control sample # 1577   342   34
##   Control sample # 1611   157   31
##   Control sample # 1696   156   31
##   Control sample # 1743   154   34
##   Control sample # 1824   143   28
##   Mutant sample # 1576    354   22
##   Mutant sample # 1609    178   10
##   Mutant sample # 1695    165   20
##   Mutant sample # 1747    170   18
##   Mutant sample # 1819    182    6
##   Mutant sample # 1823    182    6
sce <- sce[, sce$sample_name == "single cell"]
sce$sample <- factor(sce$sample)
sce$sample_name <- factor(sce$sample_name)

libsize_drop <- isOutlier(
  metric = sce$sum,
  nmads = 3,
  type = "lower",
  log = TRUE)
feature_drop <- isOutlier(
  metric = sce$detected,
  nmads = 3,
  type = "lower",
  log = TRUE)
spike_drop <- isOutlier(
  metric = sce$altexps_ERCC_percent,
  nmads = 5,
  type = "higher")
sce_pre_QC_outlier_removal <- sce
sce <- sce[, !(libsize_drop | feature_drop | spike_drop)]
data.frame(
  ByLibSize = tapply(
    libsize_drop,
    sce_pre_QC_outlier_removal$plate_number,
    sum),
  ByFeature = tapply(
    feature_drop,
    sce_pre_QC_outlier_removal$plate_number,
    sum),
  BySpike = tapply(
    spike_drop,
    sce_pre_QC_outlier_removal$plate_number,
    sum),
  Remaining = as.vector(unname(table(sce$plate_number)))) %>%
  knitr::kable(
    caption = "Number of samples removed by each QC step and the number of samples remaining.")
Number of samples removed by each QC step and the number of samples remaining.
ByLibSize ByFeature BySpike Remaining
LC279 44 59 1 311
LC280 3 6 0 364
LC294 2 4 0 366
LC358 1 2 0 364
LC392 2 2 0 368
LC396 8 10 3 286
LC398 27 34 45 187
cbind(
  "pre-QC" = table(sce_pre_QC_outlier_removal$cell_type_descriptor),
  "post-QC" = table(sce$cell_type_descriptor))
##                       pre-QC post-QC
## Control sample # 1577    370     363
## Control sample # 1611    185     182
## Control sample # 1696    184     182
## Control sample # 1743    185     184
## Control sample # 1824    167     161
## Mutant sample # 1576     370     312
## Mutant sample # 1609     185     184
## Mutant sample # 1695     182     182
## Mutant sample # 1747     185     184
## Mutant sample # 1819     185     177
## Mutant sample # 1823     185     135
plotHighestExprs(sce, n = 50)

plotHighestExprs(sce[!grepl("^mt|^Rpl|^Rps", rownames(sce))], n = 50)

ave_counts <- calcAverage(sce, use_size_factors = FALSE)
## Warning: 'calcAverage' is deprecated.
## Use 'calculateAverage' instead.
## See help("Deprecated")
## Warning: 'use_size_factors=FALSE' is deprecated.
## See help("Deprecated")
par(mfrow = c(1, 1))
hist(
  x = log10(ave_counts),
  breaks = 100,
  main = "",
  col = "grey",
  xlab = expression(Log[10] ~ "average count"))

# NOTE: I've opted to filter out genes with zero counts across all data sets
#       rather than a per-data set basis. This means that each data set continues
#       to have the same set of genes after this filter is applied.
to_keep <- ave_counts > 0
table(to_keep)
## to_keep
## FALSE  TRUE 
##   417 33880
sce <- sce[to_keep, ]

library(scran)
set.seed(1011220)
clusters <- quickCluster(
  sce,
  min.size = 60,
  use.ranks = FALSE,
  BSPARAM = BiocSingular::IrlbaParam())
table(clusters)
## clusters
##   1   2   3   4   5   6   7   8   9 
## 462 354 441 338  91 142 154 175  89
sce <- computeSumFactors(
  sce,
  clusters = clusters,
  min.mean = 0.1)
summary(sizeFactors(sce))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1618  0.6032  0.9084  1.0000  1.2747  5.1049
xlim <- c(1, max(sce$sum) / 1e3)
ylim <- range(sizeFactors(sce))
par(mfrow = c(3, 3))
lapply(sort(unique(sce$plate_number)), function(p) {
  sce <- sce[, sce$plate_number == p]
  plot(
    x = sce$sum / 1e3,
    y = sizeFactors(sce),
    log = "xy",
    xlab = "Library size (thousands)",
    ylab = "Size factor",
    main = p,
    xlim = xlim,
    ylim = ylim,
    pch = 16)
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
sce <- computeSpikeFactors(sce, spikes = "ERCC", general.use = FALSE)
sce <- logNormCounts(sce)

var.out <- modelGeneVarWithSpikes(sce, "ERCC")
par(mfrow = c(1, 1))

plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
     ylab="Variance of log-expression")
fit <- metadata(var.out)
points(fit$mean, fit$var, col="red", pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)

chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10]
plotExpression(sce, rownames(var.out)[chosen.genes])

set.seed(1000)
sce <- denoisePCA(sce, technical=fit$trend, BSPARAM=BiocSingular::IrlbaParam())
ncol(reducedDim(sce, "PCA"))
## [1] 5
set.seed(1000)
sce <- runTSNE(sce, use_dimred="PCA", perplexity=50)
## Warning: 'use_dimred' is deprecated.
## Use 'dimred' instead.
## See help("Deprecated")
plotTSNE(sce, colour_by = "group")

plotTSNE(sce, colour_by = "cell_type_descriptor")

plotTSNE(sce, colour_by = "plate_number")

plotTSNE(sce, colour_by = "subsets_Mt_percent")

# TODO: Figure out if its MT, X, and/or Y that's causing the clustering.
# NOTE: Remove MT, X, and Y and re-do t-SNE.
sce <- sce[which(!rowData(sce)$CHR %in% c("MT", "X", "Y")), ]
# sce <- sce[which(!rowData(sce)$CHR %in% c("MT")), ]
set.seed(1011220)
clusters <- quickCluster(
  sce,
  min.size = 60,
  use.ranks = FALSE,
  BSPARAM = BiocSingular::IrlbaParam())
table(clusters)
## clusters
##   1   2   3   4   5   6   7   8   9  10  11 
## 403 233 351 209 196  95 349  83  96 149  82
sce <- computeSumFactors(
  sce,
  clusters = clusters,
  min.mean = 0.1)
summary(sizeFactors(sce))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1655  0.6099  0.9091  1.0000  1.2696  5.3658
xlim <- c(1, max(sce$sum) / 1e3)
ylim <- range(sizeFactors(sce))
par(mfrow = c(3, 3))
lapply(sort(unique(sce$plate_number)), function(p) {
  sce <- sce[, sce$plate_number == p]
  plot(
    x = sce$sum / 1e3,
    y = sizeFactors(sce),
    log = "xy",
    xlab = "Library size (thousands)",
    ylab = "Size factor",
    main = p,
    xlim = xlim,
    ylim = ylim,
    pch = 16)
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
sce <- computeSpikeFactors(sce, spikes = "ERCC", general.use = FALSE)
sce <- logNormCounts(sce)

var.out <- modelGeneVarWithSpikes(sce, "ERCC")
par(mfrow = c(1, 1))

plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
     ylab="Variance of log-expression")
fit <- metadata(var.out)
points(fit$mean, fit$var, col="red", pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)

chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10]
plotExpression(sce, rownames(var.out)[chosen.genes])

set.seed(1000)
sce <- denoisePCA(sce, technical=fit$trend, BSPARAM=BiocSingular::IrlbaParam())
ncol(reducedDim(sce, "PCA"))
## [1] 5
set.seed(1000)
sce <- runTSNE(sce, use_dimred="PCA", perplexity=50)
## Warning: 'use_dimred' is deprecated.
## Use 'dimred' instead.
## See help("Deprecated")
plotTSNE(sce, colour_by = "group")

plotTSNE(sce, colour_by = "cell_type_descriptor")

plotTSNE(sce, colour_by = "plate_number")

plotTSNE(sce, colour_by = "subsets_Mt_percent")

snn.gr <- buildSNNGraph(sce, use.dimred="PCA")
cluster.out <- igraph::cluster_walktrap(snn.gr)
my.clusters <- cluster.out$membership
table(my.clusters)
## my.clusters
##   1   2   3   4   5   6   7   8   9  10 
## 152 283 193 415 401 392  94 151 135  30
sce$cluster <- factor(my.clusters)
plotTSNE(sce, colour_by="cluster")

library(SingleR)
mouse_se <- MouseRNAseqData()
## snapshotDate(): 2019-10-22
## see ?SingleR and browseVignettes('SingleR') for documentation
## loading from cache
## see ?SingleR and browseVignettes('SingleR') for documentation
## loading from cache
as.data.frame(colData(mouse_se)) %>%
  tabyl(label.main) %>%
  adorn_pct_formatting(1) %>%
  knitr::kable(
    caption = "Summary of main cell types in the `MouseRNAseqData` reference set.")
Summary of main cell types in the MouseRNAseqData reference set.
label.main n percent
Adipocytes 13 3.6%
Astrocytes 27 7.5%
B cells 5 1.4%
Cardiomyocytes 8 2.2%
Dendritic cells 2 0.6%
Endothelial cells 12 3.4%
Epithelial cells 2 0.6%
Erythrocytes 3 0.8%
Fibroblasts 45 12.6%
Granulocytes 15 4.2%
Hepatocytes 4 1.1%
Macrophages 32 8.9%
Microglia 72 20.1%
Monocytes 6 1.7%
Neurons 64 17.9%
NK cells 18 5.0%
Oligodendrocytes 12 3.4%
T cells 18 5.0%
rmarkdown::paged_table(
  as.data.frame(colData(mouse_se)) %>%
    dplyr::count(label.main, label.fine) %>%
    dplyr::arrange(label.main))
pred_mouse_cell_main <- SingleR(
  test = sce,
  ref = mouse_se,
  labels = mouse_se$label.main,
  BPPARAM = SerialParam())
tabyl(data.frame(label.main = pred_mouse_cell_main$labels), label.main) %>%
  adorn_pct_formatting(digits = 1) %>%
  dplyr::arrange(desc(n)) %>%
  knitr::kable(
    caption = "Cell label assignments using the main labels of the `MouseRNAseqData` reference data.")
Cell label assignments using the main labels of the MouseRNAseqData reference data.
label.main n percent
Endothelial cells 2244 99.9%
B cells 1 0.0%
Macrophages 1 0.0%
stopifnot(identical(rownames(pred_mouse_cell_main), colnames(sce)))
plotScoreHeatmap(
  pred_mouse_cell_main,
  annotation_col = data.frame(
    cluster = sce$cluster,
    sample = sce$cell_type_descriptor,
    row.names = rownames(pred_mouse_cell_main)))

pred_mouse_cell_fine <- SingleR(
  test = sce,
  ref = mouse_se,
  labels = mouse_se$label.fine,
  BPPARAM = SerialParam())
tabyl(data.frame(label.fine = pred_mouse_cell_fine$labels), label.fine) %>%
  adorn_pct_formatting(digits = 1) %>%
  dplyr::arrange(desc(n)) %>%
  knitr::kable(
    caption = "Cell label assignments using the fine labels of the `MouseRNAseqData` reference data.")
Cell label assignments using the fine labels of the MouseRNAseqData reference data.
label.fine n percent
Endothelial cells 2244 99.9%
Hepatocytes 2 0.1%
stopifnot(identical(rownames(pred_mouse_cell_main), colnames(sce)))
plotScoreHeatmap(
  pred_mouse_cell_fine,
  annotation_col = data.frame(
    cluster = sce$cluster,
    sample = sce$cell_type_descriptor,
    row.names = rownames(pred_mouse_cell_fine)))

genes_of_interest <- c(
  "Col4a1", "Col4a2", "Epas1", "Cdh5", "Ptprb", "Pecam1", "Vwf", "Itgb1",
  "Calcrl", "Plvap", "Tie1", "Cldn5", "Acvrl1", "Eng", "Kdr", "Kat7")
plotExpression(sce, genes_of_interest, x = "cell_type_descriptor", colour_by = "group")

data.frame(
  gene = genes_of_interest,
  median_expression = signif(rowMedians(as.matrix(logcounts(sce)[genes_of_interest, ])), 3)) %>%
  dplyr::arrange(desc(median_expression)) %>%
  knitr::kable()
gene median_expression
Col4a1 4.480
Cldn5 4.410
Col4a2 3.880
Kdr 3.290
Eng 2.710
Itgb1 2.670
Epas1 2.640
Tie1 2.580
Cdh5 2.520
Ptprb 2.510
Pecam1 2.480
Plvap 1.010
Vwf 0.995
Acvrl1 0.812
Calcrl 0.000
Kat7 0.000
p <- lapply(genes_of_interest, function(g) {
  plotTSNE(sce, colour_by = g) +
    ggtitle(g)
})
multiplot(plotlist = p, cols = 4)